In [ ]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
import plotly.express as px
pd.options.plotting.backend = 'plotly'

from lec_utils import * 
# from save_data import * 

Your Title Here¶

Name(s): Phoebe Yi and Omkar Nayak

Website Link: In progress

Step 1: Introduction¶

In [ ]:
# NOTE(review): absolute local paths — this only runs on the author's machine;
# consider a configurable DATA_DIR.
# Folder containing the per-year ICPSR download directories.
base_path = Path("/Users/yipho/eecs398/portfolio/allyears")
# Destination for the merged per-year CSVs written below.
output_path = Path("/Users/yipho/eecs398/portfolio/raw_data") 
output_path.mkdir(parents=True, exist_ok=True) 

# Survey years to extract: 2000 through 2023 inclusive.
years_to_process = range(2000, 2024)

def rename_case_id_to_respondent_id(df):
    """Standardize the respondent identifier column name.

    Renames "CASEID" to "RESPONDENT_ID" unless a "RESPONDENT_ID" column
    already exists (in which case the frame is returned unchanged).

    Fix: returns a renamed copy instead of mutating the caller's frame with
    `inplace=True`; every call site already reassigns the return value.
    """
    if "CASEID" in df.columns and "RESPONDENT_ID" not in df.columns:
        return df.rename(columns={"CASEID": "RESPONDENT_ID"})
    return df

def load_and_save_icpsr_data(base_path, output_path, years_to_process):
    """Merge Form 1 and Form 6 Stata files for each year and save one CSV per year.

    For each year, locates ICPSR study folders matching ``ICPSR_*<year>`` under
    ``base_path``, reads the Form 1 (DS0001) and Form 6 (DS0006) ``.dta`` files,
    inner-joins them on RESPONDENT_ID, tags rows with the year, and writes the
    concatenated result to ``output_path / f"ICPSR_data_{year}.csv"``. Years or
    folders with missing files are logged and skipped.
    """
    for year in years_to_process:
        print(f"Processing year {year}...")

        # Assumes folder names end with the survey year — TODO confirm the
        # download naming convention (e.g. "ICPSR_<study>-...<year>").
        icpsr_folders = list(base_path.glob(f"ICPSR_*{year}")) 
        print(icpsr_folders)
        
        if not icpsr_folders:
            print(f"No ICPSR folder found for year {year}. Skipping...")
            continue

        merged_data= []  

        for folder in icpsr_folders:
            # Study number: the token between "ICPSR_" and the first "-".
            study_number = folder.name.split("_")[1].split("-")[0]  

            form1_path = folder / f"DS0001/{study_number}-0001-Data.dta"
            form6_path = folder / f"DS0006/{study_number}-0006-Data.dta"

            # Load and merge Form 1 and Form 6 if both exist
            if form1_path.exists() and form6_path.exists():
                # print(f"Found Form 1 and Form 6 data for year {year}.")
                try:
                    # Load Form 1
                    df1 = pd.read_stata(form1_path)
                    print(f"Loaded Form 1 with shape: {df1.shape}")
                    df1 = rename_case_id_to_respondent_id(df1)

                    # Load Form 6
                    df6 = pd.read_stata(form6_path)
                    print(f"Loaded Form 6 with shape: {df6.shape}")
                    df6 = rename_case_id_to_respondent_id(df6)

                    # Inner join keeps only respondents present in both forms.
                    if "RESPONDENT_ID" in df1.columns and "RESPONDENT_ID" in df6.columns:
                        df_merged = df1.merge(df6, on="RESPONDENT_ID", how="inner")
                        print(f"Merged data shape: {df_merged.shape}")

                        # Tag rows with the survey year for later concatenation.
                        df_merged["Year"] = year

                        merged_data.append(df_merged)
                    else:
                        print(f"'RESPONDENT_ID' column missing in Form 1 or Form 6 for year {year}. Skipping merge.")
                except Exception as e:
                    # Keep the batch running if one study's files are malformed.
                    print(f"Error processing Form 1 and Form 6 for {year}: {e}")
            else:
                if not form1_path.exists():
                    print(f"Form 1 data not found for year {year}: {form1_path}")
                if not form6_path.exists():
                    print(f"Form 6 data not found for year {year}: {form6_path}")

        if merged_data:
            # Stack all studies found for this year into a single frame.
            year_df = pd.concat(merged_data, axis=0)  
            output_file = output_path / f"ICPSR_data_{year}.csv"
            year_df.to_csv(output_file, index=False)
            print(f"Saved merged data for year {year} to {output_file}")
        else:
            print(f"No merged data found for year {year}.")

# Run the extraction: writes one merged CSV per year into `output_path`.
load_and_save_icpsr_data(base_path, output_path, years_to_process)
#god bless Kerby Shedden 

Step 2: Data Cleaning and Exploratory Data Analysis¶

In [8]:
import pandas as pd
from pathlib import Path

# Input: per-year merged CSVs from the extraction step.
base_path = Path("/Users/yipho/eecs398/portfolio/raw_data")
# Output: parsed-but-unfiltered data ("unprocessed" = no invalid-code drops).
# Fix: removed the accidental leading "//" from the original path literal.
output_path = Path("/Users/yipho/eecs398/portfolio/unprocessed_data")
output_path.mkdir(parents=True, exist_ok=True)

# Logical variable name -> { (first_year, last_year): raw survey column code }.
# Year ranges are inclusive. Most codes are stable across 2000-2023;
# BR_SR_inhouse switched from V157 to V2157 starting with the 2012 survey.
# NOTE(review): code meanings inferred from the logical names — verify each
# against the ICPSR codebook.
variable_mapping = {
    "POL_BELIEFS": {
        (2000, 2023): "V5167",
    },
    "SEX" : {
        (2000, 2023): "V5150",
    },
    "NUM_SIBS": {
        (2000, 2023): "V49_x",
    }, 
    "BR_SR_inhouse": {
        (2000, 2011): "V157",
        (2012, 2023): "V2157",
    },
    "FATHR_PRES": {
        (2000, 2023): "V5155",
    },
    "MOTHR_PRES": {
        (2000, 2023): "V5156",
    },
    "LONELY": {
        (2000, 2023): "V5313", 
    },
    "WISH_MORE_FRNDS": {
        (2000, 2023): "V5321",  
    },
    "USLLY_FRNDS": {
        (2000, 2023): "V5324",  
    },
}

def get_variable_for_year(variable_name, year):
    """Resolve the raw survey column code for `variable_name` in `year`.

    Scans the inclusive (start, end) year ranges declared in the module-level
    `variable_mapping`; returns the matching raw code, or None if no range
    covers `year`.
    """
    for (start_year, end_year), raw_code in variable_mapping[variable_name].items():
        if start_year <= year <= end_year:
            return raw_code
    return None

def rename_variables(df, year):
    """Rename raw year-specific survey columns in `df` to their logical names.

    Builds a raw-name -> logical-name map for every logical variable whose raw
    code for `year` is present in `df`, applies the rename, and logs the map.
    """
    raw_to_logical = {}
    for logical_name in variable_mapping:
        raw_name = get_variable_for_year(logical_name, year)
        if raw_name and raw_name in df.columns:
            raw_to_logical[raw_name] = logical_name

    renamed_df = df.rename(columns=raw_to_logical)
    print(f"Renamed columns for year {year}: {raw_to_logical}")
    return renamed_df

def clean_and_process_data(df, year):
    """Select, parse, and filter the survey variables of interest for one year.

    Steps:
      1. Rename raw year-specific columns to logical names.
      2. Restrict to the columns of interest that are present.
      3. Parse coded string responses (e.g. "LABEL: (3)") into integers.
      4. Drop rows with missing/unparseable values in any cleaned column.

    Parameters
    ----------
    df : pd.DataFrame
        Raw merged data for one survey year.
    year : int
        Survey year; resolves year-dependent raw column names.

    Returns
    -------
    pd.DataFrame
        Cleaned frame with integer-coded responses.
    """
    df = rename_variables(df, year)

    cols_interest = [
        "RESPONDENT_ID",
        "V1_x",
        "SEX",
        "POL_BELIEFS",
        "NUM_SIBS",
        "BR_SR_inhouse",
        "FATHR_PRES",
        "MOTHR_PRES",
        "LONELY",
        "WISH_MORE_FRNDS",
        "USLLY_FRNDS",
    ]
    cols_interest = [col for col in cols_interest if col in df.columns]
    # Fix: .copy() so the column assignments below write to an independent
    # frame rather than a view of `df` (avoids SettingWithCopyWarning).
    dfmain = df[cols_interest].copy()

    cols_clean = [
        "NUM_SIBS", "SEX", "POL_BELIEFS", "BR_SR_inhouse", "FATHR_PRES",
        "MOTHR_PRES", "LONELY", "WISH_MORE_FRNDS", "USLLY_FRNDS",
    ]
    cols_clean = [col for col in cols_clean if col in dfmain.columns]

    # Compiled once instead of re-matching the raw pattern per value.
    labeled_pattern = re.compile(r".+:\s*\((-?\d+)\)")  # e.g. "AGREE: (4)"
    bare_pattern = re.compile(r"(\d+)")                 # e.g. "4"

    def extract_number(column):
        """Parse each coded response into an int, or None when unparseable."""
        def parse_value(value):
            text = str(value).strip()
            match = labeled_pattern.match(text) or bare_pattern.match(text)
            return int(match.group(1)) if match else None
        return column.apply(parse_value)

    for col in cols_clean:
        dfmain[col] = extract_number(dfmain[col])

    # Drop rows missing any variable of interest (replaces the original
    # copy-pasted per-column dropna blocks).
    for col in cols_clean:
        dfmain = dfmain.dropna(subset=[col])

    if "LONELY" in dfmain.columns:
        # dropna leaves the column as float; restore integer codes.
        dfmain["LONELY"] = dfmain["LONELY"].astype(int)

    return dfmain

# Process each CSV file based on year
def process_raw_data(base_path, output_path):
    """Clean every per-year raw CSV and write the result to `output_path`.

    Extracts the year from each ``ICPSR_data_<year>.csv`` filename, applies
    `clean_and_process_data`, and saves the result as ``data_<year>.csv``.
    An error on one file is logged and does not stop the batch.
    """
    for csv_file in base_path.glob("ICPSR_data_*.csv"):
        try:
            # Extract year from file name
            year = int(csv_file.stem.split("_")[-1])
            # print(f"Processing file for year {year}: {csv_file}")

            # Load data
            df = pd.read_csv(csv_file)
            # print(f"Loaded data with shape: {df.shape}")

            # Process data
            df_processed = clean_and_process_data(df, year)
            # print(f"Processed data shape: {df_processed.shape}")

            # Save processed data
            output_file = output_path / f"data_{year}.csv"
            df_processed.to_csv(output_file, index=False)
            print(f"Saved processed data for year {year} to {output_file}")

        except Exception as e:
            # Best-effort batch: report and continue with the next file.
            print(f"Error processing file {csv_file}: {e}")

process_raw_data(base_path, output_path)
Renamed columns for year 2013: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2013 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2013.csv
Renamed columns for year 2007: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2007 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2007.csv
Renamed columns for year 2006: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2006 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2006.csv
Renamed columns for year 2012: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2012 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2012.csv
Renamed columns for year 2004: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2004 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2004.csv
Renamed columns for year 2010: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2010 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2010.csv
Renamed columns for year 2011: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2011 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2011.csv
Renamed columns for year 2005: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2005 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2005.csv
Renamed columns for year 2001: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2001 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2001.csv
Renamed columns for year 2015: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2015 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2015.csv
Renamed columns for year 2014: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2014 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2014.csv
Renamed columns for year 2000: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2000 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2000.csv
Renamed columns for year 2016: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2016 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2016.csv
Renamed columns for year 2002: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2002 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2002.csv
Renamed columns for year 2003: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2003 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2003.csv
Renamed columns for year 2017: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2017 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2017.csv
Renamed columns for year 2019: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2019 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2019.csv
Renamed columns for year 2018: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2018 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2018.csv
Renamed columns for year 2008: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2008 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2008.csv
Renamed columns for year 2020: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2020 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2020.csv
Renamed columns for year 2021: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2021 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2021.csv
Renamed columns for year 2009: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2009 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2009.csv
Renamed columns for year 2023: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2023 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2023.csv
Renamed columns for year 2022: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Saved processed data for year 2022 to //Users/yipho/eecs398/portfolio/unprocessed_data/data_2022.csv
In [84]:
base_path = Path("/Users/yipho/eecs398/portfolio/raw_data")
output_path = Path("/Users/yipho/eecs398/portfolio/processed_data") 
output_path.mkdir(parents=True, exist_ok=True)

# Logical variable name -> { (first_year, last_year): raw survey column code }.
# NOTE(review): identical to the mapping defined in the earlier cell —
# consider defining it once (e.g. in a shared module) to avoid drift.
variable_mapping = {
    "POL_BELIEFS": {
        (2000, 2023): "V5167",
    },
    "SEX" : {
        (2000, 2023): "V5150",
    },
    "NUM_SIBS": {
        (2000, 2023): "V49_x",
    }, 
    "BR_SR_inhouse": {
        # Code changed starting with the 2012 survey.
        (2000, 2011): "V157",
        (2012, 2023): "V2157",
    },
    "FATHR_PRES": {
        (2000, 2023): "V5155",
    },
    "MOTHR_PRES": {
        (2000, 2023): "V5156",
    },
    "LONELY": {
        (2000, 2023): "V5313", 
    },
    "WISH_MORE_FRNDS": {
        (2000, 2023): "V5321",  
    },
    "USLLY_FRNDS": {
        (2000, 2023): "V5324",  
    },
}

def get_variable_for_year(variable_name, year):
    """Look up the raw column code whose year range covers `year`.

    Returns None when no range in `variable_mapping[variable_name]` matches.
    """
    ranges = variable_mapping[variable_name].items()
    matches = (code for span, code in ranges if span[0] <= year <= span[1])
    return next(matches, None)

def rename_variables(df, year):
    """Rename raw survey codes present in `df` to their logical names for `year`."""
    resolved = ((get_variable_for_year(name, year), name) for name in variable_mapping)
    renamed_columns = {
        raw: logical for raw, logical in resolved if raw and raw in df.columns
    }

    df = df.rename(columns=renamed_columns)
    print(f"Renamed columns for year {year}: {renamed_columns}")
    return df

def clean_and_process_data(df, year):
    """Select, parse, filter, and rescale the survey variables for one year.

    Steps:
      1. Rename raw year-specific columns to logical names.
      2. Restrict to the columns of interest that are present.
      3. Parse coded string responses (e.g. "LABEL: (3)") into integers.
      4. Per column: drop known invalid codes and missing values, then shift
         ordinal answers to start at 0 (e.g. SEX -> 0 male, 1 female).

    Parameters
    ----------
    df : pd.DataFrame
        Raw merged data for one survey year.
    year : int
        Survey year; resolves year-dependent raw column names.

    Returns
    -------
    pd.DataFrame
        Cleaned, rescaled frame.
    """
    df = rename_variables(df, year)

    cols_interest = [
        "RESPONDENT_ID",
        "V1_x",
        "SEX",
        "POL_BELIEFS",
        "NUM_SIBS",
        "BR_SR_inhouse",
        "FATHR_PRES",
        "MOTHR_PRES",
        "LONELY",
        "WISH_MORE_FRNDS",
        "USLLY_FRNDS",
    ]
    cols_interest = [col for col in cols_interest if col in df.columns]
    # Fix: .copy() so the column assignments below write to an independent
    # frame rather than a view of `df` (avoids SettingWithCopyWarning).
    dfmain = df[cols_interest].copy()

    cols_clean = [
        "NUM_SIBS", "SEX", "POL_BELIEFS", "BR_SR_inhouse", "FATHR_PRES",
        "MOTHR_PRES", "LONELY", "WISH_MORE_FRNDS", "USLLY_FRNDS",
    ]
    cols_clean = [col for col in cols_clean if col in dfmain.columns]

    # Compiled once instead of re-matching the raw pattern per value.
    labeled_pattern = re.compile(r".+:\s*\((-?\d+)\)")  # e.g. "AGREE: (4)"
    bare_pattern = re.compile(r"(\d+)")                 # e.g. "4"

    def extract_number(column):
        """Parse each coded response into an int, or None when unparseable."""
        def parse_value(value):
            text = str(value).strip()
            match = labeled_pattern.match(text) or bare_pattern.match(text)
            return int(match.group(1)) if match else None
        return column.apply(parse_value)

    for col in cols_clean:
        dfmain[col] = extract_number(dfmain[col])

    # Per-column rules: (invalid codes to drop, amount to subtract so ordinal
    # scales start at 0). Replaces the original nine copy-pasted blocks.
    # NOTE(review): the extra invalid codes (POL_BELIEFS 6/8, SEX 3/4) were
    # carried over from the original — confirm against the ICPSR codebook.
    cleaning_rules = {
        "POL_BELIEFS": ([6, 8, -9], 1),
        "SEX": ([-9, 3, 4], 1),  # keep 1/2 only -> 0 male, 1 female
        "NUM_SIBS": ([-9], 0),
        "BR_SR_inhouse": ([-9], 0),
        "FATHR_PRES": ([-9], 0),
        "MOTHR_PRES": ([-9], 0),
        "LONELY": ([-9], 1),
        "WISH_MORE_FRNDS": ([-9], 1),
        "USLLY_FRNDS": ([-9], 1),
    }
    for col, (invalid_codes, shift) in cleaning_rules.items():
        if col not in dfmain.columns:
            continue
        dfmain = dfmain[~dfmain[col].isin(invalid_codes)]
        dfmain = dfmain.dropna(subset=[col])
        if col == "LONELY":
            # dropna leaves the column as float; restore integer codes.
            dfmain[col] = dfmain[col].astype(int)
        if shift:
            dfmain[col] = dfmain[col] - shift

    return dfmain

# Process each CSV file based on year
def process_raw_data(base_path, output_path):
    """Clean, filter, and rescale every per-year raw CSV into `output_path`.

    Extracts the year from each ``ICPSR_data_<year>.csv`` filename, applies
    this cell's `clean_and_process_data` (the variant that drops invalid
    codes and rescales), and saves as ``data_<year>.csv``. An error on one
    file is logged and does not stop the batch.
    """
    for csv_file in base_path.glob("ICPSR_data_*.csv"):
        try:
            # Extract year from file name
            year = int(csv_file.stem.split("_")[-1])
            # print(f"Processing file for year {year}: {csv_file}")

            # Load data
            df = pd.read_csv(csv_file)
            # print(f"Loaded data with shape: {df.shape}")

            # Process data
            df_processed = clean_and_process_data(df, year)
            # print(f"Processed data shape: {df_processed.shape}")

            # Save processed data
            output_file = output_path / f"data_{year}.csv"
            df_processed.to_csv(output_file, index=False)
            # print(f"Saved processed data for year {year} to {output_file}")

        except Exception as e:
            # Best-effort batch: report and continue with the next file.
            print(f"Error processing file {csv_file}: {e}")


process_raw_data(base_path, output_path)
Renamed columns for year 2013: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2007: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2006: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2012: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2004: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2010: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2011: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2005: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2001: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2015: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2014: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2000: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2016: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2002: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2003: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2017: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2019: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2018: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2008: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2020: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2021: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2009: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2023: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2022: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
In [85]:
def avg_loneliness(df, year):
    """Mean loneliness, rescaled from the 1-5 survey codes to [0, 1].

    Fix: computes the rescaled series without mutating the caller's frame
    (the original overwrote df['LONELY'] in place as a side effect).
    `year` is unused but kept for interface consistency with the other
    avg_* helpers.
    """
    scaled = (df["LONELY"] - 1) / 4
    return scaled.mean()

def avg_pol_beliefs(df, year):
    """Mean political beliefs, rescaled from the 1-5 survey codes to [0, 1].

    Fix: computes the rescaled series without mutating the caller's frame
    (the original overwrote df['POL_BELIEFS'] in place as a side effect).
    """
    scaled = (df["POL_BELIEFS"] - 1) / 4
    return scaled.mean()


def avg_sibling_count(df, year):
    """Mean reported number of siblings (no rescaling applied)."""
    siblings = df["NUM_SIBS"]
    return siblings.mean()

def avg_wish_frnds(df,year):
    """Mean of "wish I had more friends", rescaled from 1-5 codes to [0, 1].

    Fix: computes the rescaled series without mutating the caller's frame.
    """
    scaled = (df["WISH_MORE_FRNDS"] - 1) / 4
    return scaled.mean()

def avg_uslly_frnds(df,year):
    """Mean of "usually have friends to be with", rescaled from 1-5 codes to [0, 1].

    Fix: computes the rescaled series without mutating the caller's frame.
    """
    scaled = (df["USLLY_FRNDS"] - 1) / 4
    return scaled.mean()

def avg_fathr_pres(df,year):
    """Mean of the father-presence variable (no rescaling applied)."""
    father_presence = df["FATHR_PRES"]
    return father_presence.mean()

def avg_mothr_pres(df,year):
    """Mean of the mother-presence variable (no rescaling applied)."""
    mother_presence = df["MOTHR_PRES"]
    return mother_presence.mean()
# TODO(review): father/mother presence are raw survey codes, not booleans —
# confirm whether they should be converted to 0/1 before averaging.
In [86]:
# Build a per-year summary of averaged variables from the processed data,
# then plot each series over time.
base_path = Path("/Users/yipho/eecs398/portfolio/processed_data")
data = {
    "Year": [],
    "Average Loneliness": [],
    "Average Political Beliefs": [],
    "Average Sibling Count": [],
    "Average Wish More Friends": [],
    "Average Usually Friends": [],
    "Average Father Presence": [],
    "Average Mother Presence": [],
}

for year in range(2000, 2024):
    path = base_path / f"data_{year}.csv"
    df = pd.read_csv(path)
    
    # NOTE(review): processed_data columns were already shifted by -1 during
    # cleaning, and the avg_* helpers subtract 1 again before scaling —
    # confirm the intended code range, otherwise these averages are offset.
    data["Year"].append(year)
    data["Average Loneliness"].append(avg_loneliness(df, year))
    data["Average Political Beliefs"].append(avg_pol_beliefs(df, year))
    data["Average Sibling Count"].append(avg_sibling_count(df, year))
    data["Average Wish More Friends"].append(avg_wish_frnds(df, year))
    data["Average Usually Friends"].append(avg_uslly_frnds(df, year))
    data["Average Father Presence"].append(avg_fathr_pres(df, year))
    data["Average Mother Presence"].append(avg_mothr_pres(df, year))


summary_df = pd.DataFrame(data)
# NOTE(review): this bare expression is mid-cell, so it does not display;
# it is only shown if it ends a cell.
summary_df

variables = [
    "Average Loneliness", 
    "Average Political Beliefs", 
    "Average Sibling Count", 
    "Average Wish More Friends", 
    "Average Usually Friends"
]

# One line chart per variable.
# NOTE(review): `go` is not imported in any visible cell — presumably
# provided by `from lec_utils import *`; confirm, or add
# `import plotly.graph_objects as go` to the imports cell.
for var in variables:
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=summary_df["Year"],
        y=summary_df[var],
        mode='lines+markers',
        name=var
    ))
    
    # Update layout
    fig.update_layout(
        title=f"{var} Over Time (2000-2023)",
        xaxis_title="Year",
        yaxis_title=var,
        template="plotly_white"
    )

    fig.show("notebook")
In [87]:
base_path = Path("/Users/yipho/eecs398/portfolio/unprocessed_data")

# Remove rows carrying missing-data codes in the two key variables.
def clean_data(df):
    """Drop rows whose LONELY or POL_BELIEFS value is an invalid code (-9 or 8)."""
    bad_codes = [-9, 8]
    keep = ~df["LONELY"].isin(bad_codes) & ~df["POL_BELIEFS"].isin(bad_codes)
    return df[keep]

data = {
    "Year": [],
    "Average Loneliness": [],
    "Average Political Beliefs": [],
    "Average Sibling Count": [],
    "Average Wish More Friends": [],
    "Average Usually Friends": [],
    "Average Father Presence": [],
    "Average Mother Presence": [],
}

# Column label -> helper, in the same order as the dict keys above.
cleaned_metric_fns = [
    ("Average Loneliness", avg_loneliness),
    ("Average Political Beliefs", avg_pol_beliefs),
    ("Average Sibling Count", avg_sibling_count),
    ("Average Wish More Friends", avg_wish_frnds),
    ("Average Usually Friends", avg_uslly_frnds),
    ("Average Father Presence", avg_fathr_pres),
    ("Average Mother Presence", avg_mothr_pres),
]

# Same per-year summary as before, but on data filtered by clean_data.
for year in range(2000, 2024):
    yearly_frame = clean_data(pd.read_csv(base_path / f"data_{year}.csv"))
    data["Year"].append(year)
    for label, fn in cleaned_metric_fns:
        data[label].append(fn(yearly_frame, year))


summary_df = pd.DataFrame(data)

# Overlay the two series whose relationship we want to compare.
fig = go.Figure()
for column in ("Average Political Beliefs", "Average Loneliness"):
    fig.add_trace(go.Scatter(
        x=summary_df["Year"],
        y=summary_df[column],
        mode='lines+markers',
        name=column,
    ))

fig.update_layout(
    title="Trends in Average Political Beliefs and Loneliness Over Time (2000-2023)",
    xaxis_title="Year",
    yaxis_title="Scaled Value",
    template="plotly_white",
    legend_title="Variables",
    yaxis=dict(range=[0, 1])  # Set y-axis range from 0 to 1
)

fig.show("notebook")
In [88]:
# 2023 cohort: respondent counts by loneliness level vs. political beliefs.
df23 = pd.read_csv("processed_data/data_2023.csv")
pivot_table = pd.crosstab(index=df23["LONELY"], columns=df23["POL_BELIEFS"])
pivot_table
Out[88]:
POL_BELIEFS 0 1 2 3 4
LONELY
0 16 35 32 9 4
1 7 28 53 40 17
2 17 34 46 45 17
3 11 34 39 46 18
4 9 21 33 36 9

Second Plot: Bivariate Analysis 1¶

In [89]:
# Joint distribution of the two variables as a 2-D density heatmap.
heatmap_kwargs = dict(
    x='POL_BELIEFS',
    y='LONELY',
    color_continuous_scale='Viridis',
    title="Bivariate Analysis of Political Beliefs and Loneliness (Heatmap Example)",
)
fig = px.density_heatmap(df23, **heatmap_kwargs)
fig.show("notebook")

Third Plot: Bivariate Analysis 2¶

In [90]:
# Distribution of "usually has friends" responses, split by sex.
axis_labels = {"SEX": "Sex", "USLLY_FRNDS": "Consistent Group of Friends?"}
fig = px.box(
    df23,
    x="SEX",
    y="USLLY_FRNDS",
    title="Boxplot",
    labels=axis_labels,
)
fig.show("notebook")

Interesting Aggregates!¶

In [91]:
# Counts of loneliness level vs. wishing for more friends (2023 cohort).
pivot_table = pd.crosstab(index=df23['LONELY'], columns=df23['WISH_MORE_FRNDS'])
pivot_table
Out[91]:
WISH_MORE_FRNDS 0 1 2 3 4
LONELY
0 48 18 12 10 8
1 37 45 14 32 17
2 23 29 48 39 20
3 21 28 15 50 34
4 12 11 7 25 53

Step 3: Framing a Prediction Problem¶

From the previous sections (the dual lineplot), it is clear that there is some correlation or convergence between how lonely people feel and their political leaning. Due to this, we want to explore any possible way to predict a student's political leaning using these indicators. This naturally leads us to explore classification algorithms and how we may use multiclass classification to identify a student's political disposition.

Formally, we are trying to use relevant variables that indicate the state of a respondent's social network, based on a survey, to train a multiclass classification model. On the micro scale, the algorithm would allow us to predict the political leaning of a single respondent, but on the macro scale, we can observe the sentiment of the entire class as a whole. In order to see how the overall sentiment of 12th graders changes over time, we must focus on the macro scale. Thus, our prediction problem is as follows: Can we predict the overall political leaning of the class of 12th graders based on each individual's social network state?

Because the data is constructed in a way that prevents overlaps of conflicting categorical data points, we don't need to modify our cleaned data for the purposes of the baseline model. In line with best model-building practices, we will use a 70-30 split, with 70% for training and the remaining 30% for testing. Since we are using a multiclass classification model, we will evaluate performance using accuracy and the F1 score for simplicity and clarity.

Step 4: Baseline Model¶

In [92]:
# import all the necessary tools 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics  

Building the baseline Model¶

In [93]:
# TODO
def getresbasic(year):
    """Train and evaluate a baseline random-forest classifier of POL_BELIEFS.

    Loads processed_data/data_{year}.csv, fits a RandomForestClassifier on
    three baseline features, and prints test-set accuracy and weighted F1.

    Parameters
    ----------
    year : int
        Survey year whose processed CSV should be used.

    Returns
    -------
    None
        Always (also when the year's file cannot be loaded). The baseline's
        scores are only printed; nothing from this model is reused.
    """
    # 1. Load the processed data for the requested year.
    try:
        dfpred = pd.read_csv(f"processed_data/data_{year}.csv")
    except Exception as exc:  # was a bare `except:` — report *why* loading failed
        print(f"Error loading data for year {year}: {exc}")
        return None

    # Target and the minimal baseline feature set.
    goal = dfpred["POL_BELIEFS"]
    Pred = dfpred[["BR_SR_inhouse", "LONELY", "WISH_MORE_FRNDS"]]

    # 2. 80/20 train-test split; fixed seed so reruns split identically.
    X_train, X_test, y_train, y_test = train_test_split(
        Pred, goal, random_state=100, test_size=0.20, shuffle=True
    )

    # 3. Train the model. random_state added: the original forest was
    # unseeded, so reported scores changed on every rerun.
    rf = RandomForestClassifier(n_estimators=100, random_state=100)
    rf.fit(X_train, y_train)

    # 4. Predict on the held-out split.
    y_pred = rf.predict(X_test)

    # 5. Evaluate the model (weighted F1 handles the class imbalance).
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("F1 Score:", metrics.f1_score(y_test, y_pred, average='weighted'))

    # Return nothing because we won't use anything from this model.
    return None
    

Step 5: Final Model¶

In [57]:
#Build the Final Model 
def getresfinal(year):
    # try to load in the cleaned data
    path =  f'/Users/yipho/eecs398/portfolio/unprocessed_data/data_{year}.csv'
    try:
        dfpred = pd.read_csv(path)
    except:
        print(f"Error loading data for year {year}")
        return None
    
    #subset all the invalid POL_BELIEFS data 
    
    dfprednew =  dfpred[dfpred["POL_BELIEFS"].isin([6, 8, -9]) == True]
    dfpred = dfpred[dfpred["POL_BELIEFS"].isin([6, 8, -9]) == False]

    # Update the columns to be used in the model
    #Parents Collumn 
    def make_parents(df):
        df["PARENTS_PRES"] = df["MOTHR_PRES"] + df["FATHR_PRES"]
        df = df.drop(columns=["MOTHR_PRES", "FATHR_PRES"])
        return df   
    
    def onehot(df):
        possible_values = [1, 2, 3, 4, 5]
        dummies = pd.get_dummies(df['LONELY'], prefix="LONELY")

        for value in possible_values:
            column_name = f"LONELY_{value}"
            if column_name not in dummies.columns:
                dummies[column_name] = False 
        
        df = pd.concat([df, dummies], axis=1)
        
        return df
    
    dfpred = make_parents(dfpred)
    dfpred = onehot(dfpred)

    maingoal = dfpred["POL_BELIEFS"]
    Pred = dfpred[["BR_SR_inhouse","LONELY_1","LONELY_2","LONELY_3","LONELY_4","LONELY_5","WISH_MORE_FRNDS",
                   "USLLY_FRNDS","NUM_SIBS","PARENTS_PRES","SEX"]]
    

    # 2. Split the data
    X_train, X_test, y_train, y_test = train_test_split(Pred, maingoal, random_state=100, test_size=0.20, shuffle=True) 
    

    # Set Grid Search Parameters
    
    param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, 40, 50]}
    rf = RandomForestClassifier(random_state=78)
    
    # 3. Train the model
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, scoring='accuracy') 
    grid_search.fit(X_train, y_train)
    
    best_rf = grid_search.best_estimator_
    best_rf.fit(Pred, maingoal)
    
    y_pred = best_rf.predict(X_test)
    

    Accuracy = metrics.accuracy_score(y_test, y_pred)
    F1 = metrics.f1_score(y_test, y_pred, average='weighted')
    
    print("Accuracy:", Accuracy)
    print("F1 Score:", F1)
    

    # Predict for the invalid data
    dfprednew = make_parents(dfprednew)
    dfprednew = onehot(dfprednew)

    Pred = dfprednew[["BR_SR_inhouse","LONELY_1","LONELY_2","LONELY_3","LONELY_4","LONELY_5","WISH_MORE_FRNDS",
                   "USLLY_FRNDS","NUM_SIBS","PARENTS_PRES","SEX"]]
    
    y_pred = best_rf.predict(Pred)

    # Add the new y_pred to the original goal 
    Total = pd.concat([maingoal, pd.Series(y_pred)], axis=0)

    # Find the avg total    
    return Total.mean(), year, Accuracy, F1
In [59]:
from sklearn.multioutput import MultiOutputClassifier  # NOTE(review): imported but never used here — consider removing

# Collect one record per year: predicted-mean political beliefs + test metrics.
records = []

for year in range(2000, 2024):
    try:
        result = getresfinal(year)
    except KeyError as e:
        # A year's CSV may be missing one of the expected columns.
        print(f"KeyError for year {year}: {e}")
        continue
    # BUG FIX: getresfinal returns None when a year's file fails to load; the
    # original unpacked the result unconditionally (`mean, year, Acc, F1 = ...`),
    # which would raise an uncaught "cannot unpack non-iterable NoneType".
    if result is None:
        continue
    mean, yr, Acc, F1 = result
    records.append({"Year": yr, "Mean": mean, "Accuracy": Acc, "F1": F1})

# Build the summary table in one shot instead of four parallel arrays.
FinalRes = pd.DataFrame(records, columns=["Year", "Mean", "Accuracy", "F1"])
Accuracy: 0.7718446601941747
F1 Score: 0.7506777424960853
Accuracy: 0.7894736842105263
F1 Score: 0.783077357747177
Accuracy: 0.7916666666666666
F1 Score: 0.7842150422475913
Accuracy: 0.7093023255813954
F1 Score: 0.6903324576730157
Accuracy: 0.7316176470588235
F1 Score: 0.7159800786023385
Accuracy: 0.6915254237288135
F1 Score: 0.6834058311804885
Accuracy: 0.7269372693726938
F1 Score: 0.7208770250081299
Accuracy: 0.6635220125786163
F1 Score: 0.644642247097599
Accuracy: 0.760797342192691
F1 Score: 0.7489117527052044
Accuracy: 0.6923076923076923
F1 Score: 0.6786650420458517
Accuracy: 0.7362637362637363
F1 Score: 0.7292266466363624
Accuracy: 0.8154761904761905
F1 Score: 0.8135929980466247
Accuracy: 0.75
F1 Score: 0.7396549453851684
Accuracy: 0.7177914110429447
F1 Score: 0.708349825186358
Accuracy: 0.7142857142857143
F1 Score: 0.7167513144341543
Accuracy: 0.7243589743589743
F1 Score: 0.7251534199990741
Accuracy: 0.7388535031847133
F1 Score: 0.7366251003298095
Accuracy: 0.7341040462427746
F1 Score: 0.723464871537565
Accuracy: 0.7388888888888889
F1 Score: 0.7350285361208926
Accuracy: 0.7771084337349398
F1 Score: 0.7767850983164013
Accuracy: 0.8571428571428571
F1 Score: 0.8557182613305061
Accuracy: 0.7368421052631579
F1 Score: 0.7336869478810817
Accuracy: 0.8012422360248447
F1 Score: 0.7985109614522304
Accuracy: 0.7902097902097902
F1 Score: 0.7876139688329625
In [94]:
# Work on an explicit copy: the next cell rescales the "Mean" column, and
# pd.DataFrame(reFinRes) (the original construction) can share the underlying
# data with FinalRes rather than copying it.
reFinRes = FinalRes
reFineRes1 = reFinRes.copy()
reFineRes1
Out[94]:
Year Mean Accuracy F1
0 2000 3.04 0.77 0.75
1 2001 3.03 0.79 0.78
2 2002 3.12 0.79 0.78
... ... ... ... ...
21 2021 2.99 0.74 0.73
22 2022 3.25 0.80 0.80
23 2023 3.04 0.79 0.79

24 rows × 4 columns

In [95]:
# Rescale the mean from the raw scale onto [0, 1] for plotting.
# NOTE(review): not idempotent — rerunning this cell rescales "Mean" again.
reFineRes1['Mean'] = (reFineRes1['Mean'] - 1) / 4

fig = px.line(
    reFineRes1.reset_index(),
    x="Year",
    y="Mean",
    title="Mean of Political Beliefs Over Time",
    labels={"Mean": "Mean of Political Beliefs", "Year": "Year"},
)

# Pin the axis to [0, 1] and label the endpoints with their interpretation.
fig.update_yaxes(
    range=[0, 1],
    tickvals=[0, 0.5, 1],
    ticktext=["Conservative", "Moderate", "Liberal"],
)

fig.show("notebook")
In [96]:
# Compare the survey's actual mean political beliefs with the model's
# imputed/predicted mean, year by year.
trace_specs = [
    (summary_df, "Average Political Beliefs", "Actual Mean of Political Beliefs"),
    (reFineRes1, "Mean", "Predicted Mean of Political Beliefs"),
]

fig = go.Figure()
for frame, column, label in trace_specs:
    fig.add_trace(go.Scatter(
        x=frame["Year"],
        y=frame[column],
        mode='lines+markers',
        name=label,
    ))

fig.update_layout(
    title="Actual vs. Predicted Mean of Political Beliefs Over Time",
    xaxis_title="Year",
    yaxis_title="Mean of Political Beliefs",
    yaxis=dict(range=[0, 1]), 
    template="plotly_white",
    legend_title="Metrics"
)

fig.show("notebook")
In [ ]: